import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.metrics import classification_report as cr
from mpl_toolkits import mplot3d
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import f_classif as fs
from pandas.plotting import scatter_matrix
from sklearn.linear_model import LinearRegression
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsRegressor
import sklearn.metrics as metrics
df = pd.read_csv(r"F:\MIT\salesdata.csv")
df
| Customer | Final Customer name | VAT TIN # | Doc. type | Export Type | Excise Inv # | Date | Quarter | Tax Type | Tax Code | ... | Tax Code Description | Sales Type | Tax Rate | Manufacturing Cost | Excise Duty | Ecess | SEcess | Taxable Amount | Tax Amount | Gross Amount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | LPO122 | LOPPAL ENTERPRISES | AA0025379 | Intercompany Billing | Domestic | 35000063.0 | 09.04.2020 | Quarter-1 | CST | A2 | ... | Excise duty +cess+CST outside state | CST 2% | 2.00% | 3500 | 350 | 7 | 4 | 3861 | 77 | 3938 |
| 1 | LPO122 | Jyothi Industries | 960002222 | Intercompany Billing | Domestic | 34000001.0 | 04.04.2020 | Quarter-1 | CST | A2 | ... | Excise duty +cess+CST outside state | CST 2% | 2.00% | 217975 | 21798 | 436 | 218 | 240426 | 4809 | 245235 |
| 2 | LPO122 | Surya Enterprise | 960002231 | Intercompany Billing | Domestic | 34000010.0 | 07.04.2020 | Quarter-1 | CST | A2 | ... | Excise duty +cess+CST outside state | CST 2% | 2.00% | 181696 | 18170 | 363 | 182 | 200410 | 4008 | 204419 |
| 3 | LPO122 | Cape LOPtric Private Ltd | 960002232 | Intercompany Billing | Domestic | 34000011.0 | 07.04.2020 | Quarter-1 | CST | A2 | ... | Excise duty +cess+CST outside state | CST 2% | 2.00% | 180000 | 18000 | 360 | 180 | 198540 | 3971 | 202511 |
| 4 | LPO122 | Crompton Greaves Limited | 960002235 | Intercompany Billing | Domestic | 34000013.0 | 09.04.2020 | Quarter-1 | CST | A2 | ... | Excise duty +cess+CST outside state | CST 2% | 2.00% | 37516 | 3752 | 75 | 38 | 41380 | 828 | 42208 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2388 | LPO122 | ALPOOR TRADING CORPORATIO | AA0027865 | Intercompany Bi | Domestic | 35002646.0 | 31.12.2020 | Quarter-3 | CST | A2 | ... | Excise duty +cess+CST outside state | CST 2% | 2.00% | 4350 | 435 | 9 | 4 | 4798 | 96 | 4894 |
| 2389 | LPO122 | ALPOOR TRADING CORPORATIO | AA0027866 | Intercompany Bi | Domestic | 35002647.0 | 31.12.2020 | Quarter-3 | CST | A2 | ... | Excise duty +cess+CST outside state | CST 2% | 2.00% | 34800 | 3480 | 70 | 35 | 38384 | 768 | 39152 |
| 2390 | LPO122 | ALPOOR TRADING CORPORATIO | AA0027867 | Intercompany Bi | Domestic | 35002648.0 | 31.12.2020 | Quarter-3 | CST | A2 | ... | Excise duty +cess+CST outside state | CST 2% | 2.00% | 28275 | 2828 | 57 | 28 | 31187 | 624 | 31811 |
| 2391 | LPO122 | LOPPAL ENTERPRISES | AA0027864 | Intercompany Bi | Domestic | 35002645.0 | 31.12.2020 | Quarter-3 | CST | A2 | ... | Excise duty +cess+CST outside state | CST 2% | 2.00% | 32625 | 3263 | 65 | 33 | 35985 | 720 | 36705 |
| 2392 | LPO122 | DENSO | 890001035 | Intercompany Bi | Trading | NaN | 10.12.2020 | Quarter-3 | CST | A7 | ... | Trading 2 % CST | CST 2% | 2.00% | 3000 | 255 | 0 | 0 | 3351 | 67 | 3418 |
2393 rows × 21 columns
df.head()
| Customer | Final Customer name | VAT TIN # | Doc. type | Export Type | Excise Inv # | Date | Quarter | Tax Type | Tax Code | ... | Tax Code Description | Sales Type | Tax Rate | Manufacturing Cost | Excise Duty | Ecess | SEcess | Taxable Amount | Tax Amount | Gross Amount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | LPO122 | LOPPAL ENTERPRISES | AA0025379 | Intercompany Billing | Domestic | 35000063.0 | 09.04.2020 | Quarter-1 | CST | A2 | ... | Excise duty +cess+CST outside state | CST 2% | 2.00% | 3500 | 350 | 7 | 4 | 3861 | 77 | 3938 |
| 1 | LPO122 | Jyothi Industries | 960002222 | Intercompany Billing | Domestic | 34000001.0 | 04.04.2020 | Quarter-1 | CST | A2 | ... | Excise duty +cess+CST outside state | CST 2% | 2.00% | 217975 | 21798 | 436 | 218 | 240426 | 4809 | 245235 |
| 2 | LPO122 | Surya Enterprise | 960002231 | Intercompany Billing | Domestic | 34000010.0 | 07.04.2020 | Quarter-1 | CST | A2 | ... | Excise duty +cess+CST outside state | CST 2% | 2.00% | 181696 | 18170 | 363 | 182 | 200410 | 4008 | 204419 |
| 3 | LPO122 | Cape LOPtric Private Ltd | 960002232 | Intercompany Billing | Domestic | 34000011.0 | 07.04.2020 | Quarter-1 | CST | A2 | ... | Excise duty +cess+CST outside state | CST 2% | 2.00% | 180000 | 18000 | 360 | 180 | 198540 | 3971 | 202511 |
| 4 | LPO122 | Crompton Greaves Limited | 960002235 | Intercompany Billing | Domestic | 34000013.0 | 09.04.2020 | Quarter-1 | CST | A2 | ... | Excise duty +cess+CST outside state | CST 2% | 2.00% | 37516 | 3752 | 75 | 38 | 41380 | 828 | 42208 |
5 rows × 21 columns
df.tail()
| Customer | Final Customer name | VAT TIN # | Doc. type | Export Type | Excise Inv # | Date | Quarter | Tax Type | Tax Code | ... | Tax Code Description | Sales Type | Tax Rate | Manufacturing Cost | Excise Duty | Ecess | SEcess | Taxable Amount | Tax Amount | Gross Amount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2388 | LPO122 | ALPOOR TRADING CORPORATIO | AA0027865 | Intercompany Bi | Domestic | 35002646.0 | 31.12.2020 | Quarter-3 | CST | A2 | ... | Excise duty +cess+CST outside state | CST 2% | 2.00% | 4350 | 435 | 9 | 4 | 4798 | 96 | 4894 |
| 2389 | LPO122 | ALPOOR TRADING CORPORATIO | AA0027866 | Intercompany Bi | Domestic | 35002647.0 | 31.12.2020 | Quarter-3 | CST | A2 | ... | Excise duty +cess+CST outside state | CST 2% | 2.00% | 34800 | 3480 | 70 | 35 | 38384 | 768 | 39152 |
| 2390 | LPO122 | ALPOOR TRADING CORPORATIO | AA0027867 | Intercompany Bi | Domestic | 35002648.0 | 31.12.2020 | Quarter-3 | CST | A2 | ... | Excise duty +cess+CST outside state | CST 2% | 2.00% | 28275 | 2828 | 57 | 28 | 31187 | 624 | 31811 |
| 2391 | LPO122 | LOPPAL ENTERPRISES | AA0027864 | Intercompany Bi | Domestic | 35002645.0 | 31.12.2020 | Quarter-3 | CST | A2 | ... | Excise duty +cess+CST outside state | CST 2% | 2.00% | 32625 | 3263 | 65 | 33 | 35985 | 720 | 36705 |
| 2392 | LPO122 | DENSO | 890001035 | Intercompany Bi | Trading | NaN | 10.12.2020 | Quarter-3 | CST | A7 | ... | Trading 2 % CST | CST 2% | 2.00% | 3000 | 255 | 0 | 0 | 3351 | 67 | 3418 |
5 rows × 21 columns
df.shape
(2393, 21)
df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Excise Inv # | 2349.0 | 3.485301e+07 | 355664.565423 | 34000001.0 | 35000306.0 | 35001062.0 | 35001841.0 | 35002648.0 |
| FG/ Scrap/ Cafetaria | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Manufacturing Cost | 2393.0 | 1.181299e+05 | 318842.217034 | -34000.0 | 6996.0 | 20925.0 | 61660.0 | 2392650.0 |
| Excise Duty | 2393.0 | 1.180305e+04 | 31881.634658 | -4080.0 | 700.0 | 2093.0 | 6166.0 | 239265.0 |
| Ecess | 2393.0 | 2.355976e+02 | 637.692745 | -82.0 | 14.0 | 41.0 | 122.0 | 4785.0 |
| SEcess | 2393.0 | 1.178224e+02 | 318.846800 | -41.0 | 7.0 | 21.0 | 61.0 | 2393.0 |
| Taxable Amount | 2393.0 | 1.303114e+05 | 351676.819145 | -38202.0 | 7717.0 | 23150.0 | 68011.0 | 2639093.0 |
| Tax Amount | 2393.0 | 2.605940e+03 | 7033.634260 | -764.0 | 154.0 | 462.0 | 1360.0 | 52782.0 |
| Gross Amount | 2393.0 | 1.329176e+05 | 358710.360253 | -38966.0 | 7871.0 | 23613.0 | 69371.0 | 2691875.0 |
df.corr()
| Excise Inv # | FG/ Scrap/ Cafetaria | Manufacturing Cost | Excise Duty | Ecess | SEcess | Taxable Amount | Tax Amount | Gross Amount | |
|---|---|---|---|---|---|---|---|---|---|
| Excise Inv # | 1.000000 | NaN | -0.592450 | -0.592450 | -0.592460 | -0.592475 | -0.592439 | -0.592449 | -0.592439 |
| FG/ Scrap/ Cafetaria | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Manufacturing Cost | -0.592450 | NaN | 1.000000 | 0.999959 | 0.999645 | 0.999644 | 0.999998 | 1.000000 | 0.999998 |
| Excise Duty | -0.592450 | NaN | 0.999959 | 1.000000 | 0.999843 | 0.999842 | 0.999959 | 0.999961 | 0.999959 |
| Ecess | -0.592460 | NaN | 0.999645 | 0.999843 | 1.000000 | 0.999999 | 0.999648 | 0.999650 | 0.999648 |
| SEcess | -0.592475 | NaN | 0.999644 | 0.999842 | 0.999999 | 1.000000 | 0.999648 | 0.999650 | 0.999648 |
| Taxable Amount | -0.592439 | NaN | 0.999998 | 0.999959 | 0.999648 | 0.999648 | 1.000000 | 0.999998 | 1.000000 |
| Tax Amount | -0.592449 | NaN | 1.000000 | 0.999961 | 0.999650 | 0.999650 | 0.999998 | 1.000000 | 0.999998 |
| Gross Amount | -0.592439 | NaN | 0.999998 | 0.999959 | 0.999648 | 0.999648 | 1.000000 | 0.999998 | 1.000000 |
print('Rows :',df.shape[0])
print('Columns :',df.shape[1])
print('\nFeatures :\n :',df.columns.tolist())
print('\nMissing values :',df.isnull().values.sum()) #for finding null values in the following dataset.
print('\nUnique values : \n',df.nunique())
print('\nStatical Measure of the data set :\n',df.describe()) # Statically computed values of the dataset.
Rows : 2393
Columns : 21
Features :
: ['Customer', 'Final Customer name', 'VAT TIN #', 'Doc. type', 'Export Type', 'Excise Inv #', 'Date', 'Quarter', 'Tax Type', 'Tax Code', 'FG/ Scrap/ Cafetaria', 'Tax Code Description', 'Sales Type', 'Tax Rate', 'Manufacturing Cost', 'Excise Duty', 'Ecess', 'SEcess', 'Taxable Amount', 'Tax Amount', 'Gross Amount']
Missing values : 2439
Unique values :
Customer 1
Final Customer name 35
VAT TIN # 2393
Doc. type 4
Export Type 2
Excise Inv # 2349
Date 212
Quarter 3
Tax Type 1
Tax Code 3
FG/ Scrap/ Cafetaria 0
Tax Code Description 3
Sales Type 1
Tax Rate 1
Manufacturing Cost 1197
Excise Duty 1170
Ecess 454
SEcess 343
Taxable Amount 1201
Tax Amount 992
Gross Amount 1201
dtype: int64
Statical Measure of the data set :
Excise Inv # FG/ Scrap/ Cafetaria Manufacturing Cost Excise Duty \
count 2.349000e+03 0.0 2.393000e+03 2393.000000
mean 3.485301e+07 NaN 1.181299e+05 11803.049728
std 3.556646e+05 NaN 3.188422e+05 31881.634658
min 3.400000e+07 NaN -3.400000e+04 -4080.000000
25% 3.500031e+07 NaN 6.996000e+03 700.000000
50% 3.500106e+07 NaN 2.092500e+04 2093.000000
75% 3.500184e+07 NaN 6.166000e+04 6166.000000
max 3.500265e+07 NaN 2.392650e+06 239265.000000
Ecess SEcess Taxable Amount Tax Amount Gross Amount
count 2393.000000 2393.000000 2.393000e+03 2393.000000 2.393000e+03
mean 235.597576 117.822399 1.303114e+05 2605.939824 1.329176e+05
std 637.692745 318.846800 3.516768e+05 7033.634260 3.587104e+05
min -82.000000 -41.000000 -3.820200e+04 -764.000000 -3.896600e+04
25% 14.000000 7.000000 7.717000e+03 154.000000 7.871000e+03
50% 41.000000 21.000000 2.315000e+04 462.000000 2.361300e+04
75% 122.000000 61.000000 6.801100e+04 1360.000000 6.937100e+04
max 4785.000000 2393.000000 2.639093e+06 52782.000000 2.691875e+06
df = df.drop(columns=['Customer',
'Final Customer name',
'VAT TIN #',
'Doc. type',
'Export Type',
'Excise Inv #',
'Tax Type',
'Tax Code',
'FG/ Scrap/ Cafetaria',
'Tax Code Description',
'Sales Type',
'Tax Rate',
])
df
| Date | Quarter | Manufacturing Cost | Excise Duty | Ecess | SEcess | Taxable Amount | Tax Amount | Gross Amount | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 09.04.2020 | Quarter-1 | 3500 | 350 | 7 | 4 | 3861 | 77 | 3938 |
| 1 | 04.04.2020 | Quarter-1 | 217975 | 21798 | 436 | 218 | 240426 | 4809 | 245235 |
| 2 | 07.04.2020 | Quarter-1 | 181696 | 18170 | 363 | 182 | 200410 | 4008 | 204419 |
| 3 | 07.04.2020 | Quarter-1 | 180000 | 18000 | 360 | 180 | 198540 | 3971 | 202511 |
| 4 | 09.04.2020 | Quarter-1 | 37516 | 3752 | 75 | 38 | 41380 | 828 | 42208 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2388 | 31.12.2020 | Quarter-3 | 4350 | 435 | 9 | 4 | 4798 | 96 | 4894 |
| 2389 | 31.12.2020 | Quarter-3 | 34800 | 3480 | 70 | 35 | 38384 | 768 | 39152 |
| 2390 | 31.12.2020 | Quarter-3 | 28275 | 2828 | 57 | 28 | 31187 | 624 | 31811 |
| 2391 | 31.12.2020 | Quarter-3 | 32625 | 3263 | 65 | 33 | 35985 | 720 | 36705 |
| 2392 | 10.12.2020 | Quarter-3 | 3000 | 255 | 0 | 0 | 3351 | 67 | 3418 |
2393 rows × 9 columns
df.dtypes
Date object Quarter object Manufacturing Cost int64 Excise Duty int64 Ecess int64 SEcess int64 Taxable Amount int64 Tax Amount int64 Gross Amount int64 dtype: object
sns.heatmap(df.isnull())
<AxesSubplot:>
df.isnull().sum()
Date 0 Quarter 0 Manufacturing Cost 0 Excise Duty 0 Ecess 0 SEcess 0 Taxable Amount 0 Tax Amount 0 Gross Amount 0 dtype: int64
df.dropna(inplace=True)
print(df)
Date Quarter Manufacturing Cost Excise Duty Ecess SEcess \
0 09.04.2020 Quarter-1 3500 350 7 4
1 04.04.2020 Quarter-1 217975 21798 436 218
2 07.04.2020 Quarter-1 181696 18170 363 182
3 07.04.2020 Quarter-1 180000 18000 360 180
4 09.04.2020 Quarter-1 37516 3752 75 38
... ... ... ... ... ... ...
2388 31.12.2020 Quarter-3 4350 435 9 4
2389 31.12.2020 Quarter-3 34800 3480 70 35
2390 31.12.2020 Quarter-3 28275 2828 57 28
2391 31.12.2020 Quarter-3 32625 3263 65 33
2392 10.12.2020 Quarter-3 3000 255 0 0
Taxable Amount Tax Amount Gross Amount
0 3861 77 3938
1 240426 4809 245235
2 200410 4008 204419
3 198540 3971 202511
4 41380 828 42208
... ... ... ...
2388 4798 96 4894
2389 38384 768 39152
2390 31187 624 31811
2391 35985 720 36705
2392 3351 67 3418
[2393 rows x 9 columns]
df.isnull().sum()
Date 0 Quarter 0 Manufacturing Cost 0 Excise Duty 0 Ecess 0 SEcess 0 Taxable Amount 0 Tax Amount 0 Gross Amount 0 dtype: int64
df[df==0].count()
Date 0 Quarter 0 Manufacturing Cost 0 Excise Duty 0 Ecess 42 SEcess 44 Taxable Amount 0 Tax Amount 0 Gross Amount 0 dtype: int64
df['Ecess']=df['Ecess'].replace(0,df['Ecess'].mean())
df['SEcess']=df['SEcess'].replace(0,df['SEcess'].mean())
df[df==0].count()
Date 0 Quarter 0 Manufacturing Cost 0 Excise Duty 0 Ecess 0 SEcess 0 Taxable Amount 0 Tax Amount 0 Gross Amount 0 dtype: int64
Data Visualization
#!pip install autoviz
from autoviz.AutoViz_Class import AutoViz_Class
AV = AutoViz_Class()
Imported AutoViz_Class version: 0.0.81. Call using:
from autoviz.AutoViz_Class import AutoViz_Class
AV = AutoViz_Class()
AV.AutoViz(filename, sep=',', depVar='', dfte=None, header=0, verbose=0,
lowess=False,chart_format='svg',max_rows_analyzed=150000,max_cols_analyzed=30)
Note: verbose=0 or 1 generates charts and displays them in your local Jupyter notebook.
verbose=2 saves plots in your local machine under AutoViz_Plots directory and does not display charts.
#pip install pandas-profiling
from autoviz.AutoViz_Class import AutoViz_Class
AV = AutoViz_Class()
filename = (r"F:\MIT\salesdata.csv")
sep = ","
dft = AV.AutoViz(
filename,
sep=",",
depVar="",
dfte=None,
header=1,
verbose=0,
lowess=False,
chart_format="svg",
max_rows_analyzed=2393,
max_cols_analyzed=21,
)
Shape of your Data Set: (2393, 21)
############## C L A S S I F Y I N G V A R I A B L E S ####################
Classifying variables in data set...
Number of Numeric Columns = 1
Number of Integer-Categorical Columns = 7
Number of String-Categorical Columns = 3
Number of Factor-Categorical Columns = 0
Number of String-Boolean Columns = 1
Number of Numeric-Boolean Columns = 0
Number of Discrete String Columns = 2
Number of NLP String Columns = 1
Number of Date Time Columns = 0
Number of ID Columns = 1
Number of Columns to Delete = 5
21 Predictors classified...
This does not include the Target column(s)
9 variables removed since they were ID or low-information variables
Since Number of Rows in data 2393 exceeds maximum, randomly sampling 2393 rows for EDA...
Time to run AutoViz (in seconds) = 7.353 ###################### VISUALIZATION Completed ########################
Pandas_Profiling
import pandas_profiling
report = pandas_profiling.ProfileReport(df)
report
df.hist(bins=50, figsize=(20,20))
array([[<AxesSubplot:title={'center':'Manufacturing Cost'}>,
<AxesSubplot:title={'center':'Excise Duty'}>,
<AxesSubplot:title={'center':'Ecess'}>],
[<AxesSubplot:title={'center':'SEcess'}>,
<AxesSubplot:title={'center':'Taxable Amount'}>,
<AxesSubplot:title={'center':'Tax Amount'}>],
[<AxesSubplot:title={'center':'Gross Amount'}>, <AxesSubplot:>,
<AxesSubplot:>]], dtype=object)
Label Encoding
from sklearn.preprocessing import LabelEncoder
le= LabelEncoder()
df['Quarter']= le.fit_transform(df['Quarter'])
df
| Date | Quarter | Manufacturing Cost | Excise Duty | Ecess | SEcess | Taxable Amount | Tax Amount | Gross Amount | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 09.04.2020 | 0 | 3500 | 350 | 7.000000 | 4.000000 | 3861 | 77 | 3938 |
| 1 | 04.04.2020 | 0 | 217975 | 21798 | 436.000000 | 218.000000 | 240426 | 4809 | 245235 |
| 2 | 07.04.2020 | 0 | 181696 | 18170 | 363.000000 | 182.000000 | 200410 | 4008 | 204419 |
| 3 | 07.04.2020 | 0 | 180000 | 18000 | 360.000000 | 180.000000 | 198540 | 3971 | 202511 |
| 4 | 09.04.2020 | 0 | 37516 | 3752 | 75.000000 | 38.000000 | 41380 | 828 | 42208 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2388 | 31.12.2020 | 2 | 4350 | 435 | 9.000000 | 4.000000 | 4798 | 96 | 4894 |
| 2389 | 31.12.2020 | 2 | 34800 | 3480 | 70.000000 | 35.000000 | 38384 | 768 | 39152 |
| 2390 | 31.12.2020 | 2 | 28275 | 2828 | 57.000000 | 28.000000 | 31187 | 624 | 31811 |
| 2391 | 31.12.2020 | 2 | 32625 | 3263 | 65.000000 | 33.000000 | 35985 | 720 | 36705 |
| 2392 | 10.12.2020 | 2 | 3000 | 255 | 235.597576 | 117.822399 | 3351 | 67 | 3418 |
2393 rows × 9 columns
#acc = []
#model = []
#x = df[['Month No','Manufacturing Cost','Excise Duty','Ecess','SEcess','Taxable Amount','Tax Amount']]
#y = df[['Gross Amount']]
#x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
KNeighborsRegressor
#k = KNeighborsRegressor(n_neighbors=3,p=2)
#k
#k.fit(x_train,y_train)
#print("Euclidian Distance Score")
#k.score(x_test,y_test)
#k2 = KNeighborsRegressor(n_neighbors=3,p=1)
#k2.fit(x_train,y_train)
#k2.fit(x_train,y_train)
#print("Manhattan Distance Score")
#k2.score(x_test,y_test)
#k_values = np.arange(1,1501,2)
#k_values
#train_score_arr = []
#val_score_arr = []
#for k in k_values:
#model = KNeighborsRegressor(n_neighbors=k)
#model.fit(x_train,y_train)
# We evaluate the performance of model, for each k - value
#train_score = model.score(x_train, y_train)
#train_score_arr.append(train_score*100)
#val_score = model.score(x_test, y_test)
#val_score_arr.append(val_score*100)
#print("k=%d, train_accuracy=%.2f%%, test_accuracy=%.2f%%" % (k, train_score * 100, val_score*100))
#plt.plot(k_values,train_score_arr,'g')
#plt.plot(k_values,val_score_arr,'r')
#Support Vector Machine (SVM)
#from sklearn.svm import SVC
#SVM = SVC(gamma='auto')
#SVM.fit(x_train,y_train)
#predicted_values = SVM.predict(x_test)
#x = metrics.accuracy_score(y_test, predicted_values)
#acc.append(x)
#model.append('SVM')
#print("SVM's Accuracy is: ", x)
#print(classification_report(y_test,predicted_values))
LazyRegressor
#pip install lazypredict
#from lazypredict.Supervised import LazyRegressor
#reg = LazyRegressor(verbose=0, ignore_warnings=False, custom_metric=None)
#models, predictions = reg.fit(x_train, x_test, y_train, y_test)
#print(models)
Regression
#from sklearn.linear_model import LinearRegression
#reggre = LinearRegression()
#reggre.fit(x_train, y_train)
#reggre.score(x_test, y_test)
#reggre.intercept_
#reggre.coef_
Time Series
df1 = df.drop(columns=[
'Manufacturing Cost',
'Excise Duty',
'Ecess',
'SEcess',
'Taxable Amount',
'Tax Amount',
'Quarter',
])
df1
| Date | Gross Amount | |
|---|---|---|
| 0 | 09.04.2020 | 3938 |
| 1 | 04.04.2020 | 245235 |
| 2 | 07.04.2020 | 204419 |
| 3 | 07.04.2020 | 202511 |
| 4 | 09.04.2020 | 42208 |
| ... | ... | ... |
| 2388 | 31.12.2020 | 4894 |
| 2389 | 31.12.2020 | 39152 |
| 2390 | 31.12.2020 | 31811 |
| 2391 | 31.12.2020 | 36705 |
| 2392 | 10.12.2020 | 3418 |
2393 rows × 2 columns
df1.dtypes
Date object Gross Amount int64 dtype: object
df1.isnull().sum()
Date 0 Gross Amount 0 dtype: int64
df1[df1==0].count()
Date 0 Gross Amount 0 dtype: int64
# Convert Month into Datetime
df1['Date']=pd.to_datetime(df1['Date'])
df1.head()
| Date | Gross Amount | |
|---|---|---|
| 0 | 2020-09-04 | 3938 |
| 1 | 2020-04-04 | 245235 |
| 2 | 2020-07-04 | 204419 |
| 3 | 2020-07-04 | 202511 |
| 4 | 2020-09-04 | 42208 |
df1.set_index('Date',inplace=True)
df1.head()
| Gross Amount | |
|---|---|
| Date | |
| 2020-09-04 | 3938 |
| 2020-04-04 | 245235 |
| 2020-07-04 | 204419 |
| 2020-07-04 | 202511 |
| 2020-09-04 | 42208 |
df1
| Gross Amount | |
|---|---|
| Date | |
| 2020-09-04 | 3938 |
| 2020-04-04 | 245235 |
| 2020-07-04 | 204419 |
| 2020-07-04 | 202511 |
| 2020-09-04 | 42208 |
| ... | ... |
| 2020-12-31 | 4894 |
| 2020-12-31 | 39152 |
| 2020-12-31 | 31811 |
| 2020-12-31 | 36705 |
| 2020-10-12 | 3418 |
2393 rows × 1 columns
df1.plot
df1['Gross Amount'].plot()
<AxesSubplot:xlabel='Date'>
df1['Gross Amount'].shift(1)
Date
2020-09-04 NaN
2020-04-04 3938.0
2020-07-04 245235.0
2020-07-04 204419.0
2020-09-04 202511.0
...
2020-12-31 13231.0
2020-12-31 4894.0
2020-12-31 39152.0
2020-12-31 31811.0
2020-10-12 36705.0
Name: Gross Amount, Length: 2393, dtype: float64
df1['Seasonal First Difference']=df1['Gross Amount']-df1['Gross Amount'].shift(12)
df1.head(14)
| Gross Amount | Seasonal First Difference | |
|---|---|---|
| Date | ||
| 2020-09-04 | 3938 | NaN |
| 2020-04-04 | 245235 | NaN |
| 2020-07-04 | 204419 | NaN |
| 2020-07-04 | 202511 | NaN |
| 2020-09-04 | 42208 | NaN |
| 2020-09-04 | 1712407 | NaN |
| 2020-09-04 | 1942065 | NaN |
| 2020-12-04 | 91524 | NaN |
| 2020-12-04 | 33752 | NaN |
| 2020-12-04 | 78585 | NaN |
| 2020-12-04 | 1464828 | NaN |
| 2020-04-15 | 1942065 | NaN |
| 2020-04-15 | 1725932 | 1721994.0 |
| 2020-04-18 | 1688819 | 1443584.0 |
df1['Seasonal First Difference'].plot()
<AxesSubplot:xlabel='Date'>
from pandas.plotting import autocorrelation_plot
autocorrelation_plot(df1['Gross Amount'])
plt.show()
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
fig = plt.figure(figsize=(12,8))
ax1 = fig.add_subplot(211)
fig = sm.graphics.tsa.plot_acf(df1['Seasonal First Difference'].iloc[13:],lags=40,ax=ax1)
ax2 = fig.add_subplot(212)
fig = sm.graphics.tsa.plot_pacf(df1['Seasonal First Difference'].iloc[13:],lags=40,ax=ax2)
# For non-seasonal data
#p=1, d=1, q=0 or 1
from statsmodels.tsa.arima_model import ARIMA
model=ARIMA(df['Gross Amount'],order=(1,1,1))
model_fit=model.fit()
model_fit.summary()
| Dep. Variable: | D.Gross Amount | No. Observations: | 2392 |
|---|---|---|---|
| Model: | ARIMA(1, 1, 1) | Log Likelihood | -33662.178 |
| Method: | css-mle | S.D. of innovations | 312896.088 |
| Date: | Mon, 17 May 2021 | AIC | 67332.357 |
| Time: | 10:34:29 | BIC | 67355.476 |
| Sample: | 1 | HQIC | 67340.769 |
| coef | std err | z | P>|z| | [0.025 | 0.975] | |
|---|---|---|---|---|---|---|
| const | -0.2174 | 1010.942 | -0.000 | 1.000 | -1981.627 | 1981.192 |
| ar.L1.D.Gross Amount | 0.0412 | 0.024 | 1.705 | 0.088 | -0.006 | 0.089 |
| ma.L1.D.Gross Amount | -0.8490 | 0.013 | -67.792 | 0.000 | -0.874 | -0.824 |
| Real | Imaginary | Modulus | Frequency | |
|---|---|---|---|---|
| AR.1 | 24.2628 | +0.0000j | 24.2628 | 0.0000 |
| MA.1 | 1.1779 | +0.0000j | 1.1779 | 0.0000 |